Brief introduction for Stata users
Data Analytics Unit
Stata
R
Stata
ssc install
R
install.packages()
Base R:
# Loading CSV data
dataFrame <- read.csv("data.csv")
# Creating a new variable
# `age` is a column of dataFrame, so it has to be referenced through the data
# frame; a bare `age` would be looked up as a standalone object.
dataFrame$age_square <- dataFrame$age^2
# Subsetting the data frame (keep rows where age_square is at least 250)
dataFrame <- dataFrame[dataFrame$age_square >= 250, ]
# Merging with aux_data on the shared "id" column
merge(dataFrame, aux_data, by = "id")
Tidyverse:
Stata:
* Loading CSV data
import delimited "data.csv", clear
* Creating a new variable (generate needs a name and an "=" expression)
gen age_square = age^2
* Subsetting the data frame
keep if age_square >= 250
* Merging
merge 1:1 id using "aux_data"
* Collapsing data
collapse (mean) age_square, by(country)
Tidyverse:
Stata
program define
R
function()
Stata
R
# Build, for every country, the relative change of each factor and of the
# ROLI index between 2015 and 2022.
factors_data <- master_data.df %>%
  select(year, country, starts_with("fct"), roli_index) %>%
  # This is the period that we are studying
  filter(year == 2015 | year == 2022) %>%
  dplyr::rename(
    factor_1 = fct1,
    factor_2 = fct2,
    factor_3 = fct3,
    factor_4 = fct4,
    factor_5 = fct5,
    factor_6 = fct6,
    factor_7 = fct7,
    factor_8 = fct8
  ) %>%
  # We reshape the data to wide format (one row per country, one column per
  # variable-year pair) to make the estimation of the rates easier
  pivot_wider(
    id_cols = country,
    names_from = year,
    values_from = c(
      factor_1, factor_2, factor_3, factor_4, factor_5,
      factor_6, factor_7, factor_8, roli_index
    )
  ) %>%
  # We estimate the rate of change between 2015 and 2022 per factor
  mutate(
    differences_factor_1 = (factor_1_2022 - factor_1_2015) / factor_1_2015,
    differences_factor_2 = (factor_2_2022 - factor_2_2015) / factor_2_2015,
    differences_factor_3 = (factor_3_2022 - factor_3_2015) / factor_3_2015,
    differences_factor_4 = (factor_4_2022 - factor_4_2015) / factor_4_2015,
    differences_factor_5 = (factor_5_2022 - factor_5_2015) / factor_5_2015,
    differences_factor_6 = (factor_6_2022 - factor_6_2015) / factor_6_2015,
    differences_factor_7 = (factor_7_2022 - factor_7_2015) / factor_7_2015,
    differences_factor_8 = (factor_8_2022 - factor_8_2015) / factor_8_2015,
    differences_roli = (roli_index_2022 - roli_index_2015) / roli_index_2015
  ) %>%
  # Keep only the country identifier and the computed rates
  select(country, starts_with("differences"))
We do some tests to define the optimal number of clusters. This is based on the independent information provided by each one.
# Within-cluster sum of squares ("wss") method: according to the graph, the
# main change is produced after the third cluster
fviz_nbclust(x = cluster_data, FUNcluster = kmeans, method = "wss")
# Silhouette method: according to this method, the optimal number of
# clusters is two
fviz_nbclust(x = cluster_data, FUNcluster = kmeans, method = "silhouette")
We constructed each cluster using the Hierarchical Clustering Method and determined the distribution pattern of each country.
# Hierarchical clustering cut into k = 3 groups, with standardization
# requested (stand = TRUE) and the Euclidean metric.
# NOTE(review): the original note says "We use the package FactoMineR";
# hcut() appears to come from factoextra - confirm which package is loaded.
res2 <- hcut(cluster_data, k = 3, stand = TRUE, hc_metric = "euclidean")
Carlos and Santiago